In [66]:
data_dir = './downloads'
In [67]:
import os
os.listdir(data_dir)
import pandas as pd
hotel_raw_df = pd.read_csv('downloads/hotel_bookings.csv')
In [68]:
#Data Preparation and Cleaning
In [69]:
hotel_raw_df
Out[69]:
hotel is_canceled lead_time arrival_date_year arrival_date_month arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights stays_in_week_nights adults ... deposit_type agent company days_in_waiting_list customer_type adr required_car_parking_spaces total_of_special_requests reservation_status reservation_status_date
0 Resort Hotel 0 342 2015 July 27 1 0 0 2 ... No Deposit NaN NaN 0 Transient 0.00 0 0 Check-Out 2015-07-01
1 Resort Hotel 0 737 2015 July 27 1 0 0 2 ... No Deposit NaN NaN 0 Transient 0.00 0 0 Check-Out 2015-07-01
2 Resort Hotel 0 7 2015 July 27 1 0 1 1 ... No Deposit NaN NaN 0 Transient 75.00 0 0 Check-Out 2015-07-02
3 Resort Hotel 0 13 2015 July 27 1 0 1 1 ... No Deposit 304.0 NaN 0 Transient 75.00 0 0 Check-Out 2015-07-02
4 Resort Hotel 0 14 2015 July 27 1 0 2 2 ... No Deposit 240.0 NaN 0 Transient 98.00 0 1 Check-Out 2015-07-03
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
119385 City Hotel 0 23 2017 August 35 30 2 5 2 ... No Deposit 394.0 NaN 0 Transient 96.14 0 0 Check-Out 2017-09-06
119386 City Hotel 0 102 2017 August 35 31 2 5 3 ... No Deposit 9.0 NaN 0 Transient 225.43 0 2 Check-Out 2017-09-07
119387 City Hotel 0 34 2017 August 35 31 2 5 2 ... No Deposit 9.0 NaN 0 Transient 157.71 0 4 Check-Out 2017-09-07
119388 City Hotel 0 109 2017 August 35 31 2 5 2 ... No Deposit 89.0 NaN 0 Transient 104.40 0 0 Check-Out 2017-09-07
119389 City Hotel 0 205 2017 August 35 29 2 7 2 ... No Deposit 9.0 NaN 0 Transient 151.20 0 2 Check-Out 2017-09-07

119390 rows × 32 columns

In [70]:
hotel_raw_df.shape
Out[70]:
(119390, 32)
In [71]:
hotel_raw_df.columns
Out[71]:
Index(['hotel', 'is_canceled', 'lead_time', 'arrival_date_year',
       'arrival_date_month', 'arrival_date_week_number',
       'arrival_date_day_of_month', 'stays_in_weekend_nights',
       'stays_in_week_nights', 'adults', 'children', 'babies', 'meal',
       'country', 'market_segment', 'distribution_channel',
       'is_repeated_guest', 'previous_cancellations',
       'previous_bookings_not_canceled', 'reserved_room_type',
       'assigned_room_type', 'booking_changes', 'deposit_type', 'agent',
       'company', 'days_in_waiting_list', 'customer_type', 'adr',
       'required_car_parking_spaces', 'total_of_special_requests',
       'reservation_status', 'reservation_status_date'],
      dtype='object')
In [72]:
hotel_raw_df.describe()
Out[72]:
is_canceled lead_time arrival_date_year arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights stays_in_week_nights adults children babies is_repeated_guest previous_cancellations previous_bookings_not_canceled booking_changes agent company days_in_waiting_list adr required_car_parking_spaces total_of_special_requests
count 119390.000000 119390.000000 119390.000000 119390.000000 119390.000000 119390.000000 119390.000000 119390.000000 119386.000000 119390.000000 119390.000000 119390.000000 119390.000000 119390.000000 103050.000000 6797.000000 119390.000000 119390.000000 119390.000000 119390.000000
mean 0.370416 104.011416 2016.156554 27.165173 15.798241 0.927599 2.500302 1.856403 0.103890 0.007949 0.031912 0.087118 0.137097 0.221124 86.693382 189.266735 2.321149 101.831122 0.062518 0.571363
std 0.482918 106.863097 0.707476 13.605138 8.780829 0.998613 1.908286 0.579261 0.398561 0.097436 0.175767 0.844336 1.497437 0.652306 110.774548 131.655015 17.594721 50.535790 0.245291 0.792798
min 0.000000 0.000000 2015.000000 1.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 6.000000 0.000000 -6.380000 0.000000 0.000000
25% 0.000000 18.000000 2016.000000 16.000000 8.000000 0.000000 1.000000 2.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 9.000000 62.000000 0.000000 69.290000 0.000000 0.000000
50% 0.000000 69.000000 2016.000000 28.000000 16.000000 1.000000 2.000000 2.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 14.000000 179.000000 0.000000 94.575000 0.000000 0.000000
75% 1.000000 160.000000 2017.000000 38.000000 23.000000 2.000000 3.000000 2.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 229.000000 270.000000 0.000000 126.000000 0.000000 1.000000
max 1.000000 737.000000 2017.000000 53.000000 31.000000 19.000000 50.000000 55.000000 10.000000 10.000000 1.000000 26.000000 72.000000 21.000000 535.000000 543.000000 391.000000 5400.000000 8.000000 5.000000
In [73]:
hotel_raw_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal                            119390 non-null  object 
 13  country                         118902 non-null  object 
 14  market_segment                  119390 non-null  object 
 15  distribution_channel            119390 non-null  object 
 16  is_repeated_guest               119390 non-null  int64  
 17  previous_cancellations          119390 non-null  int64  
 18  previous_bookings_not_canceled  119390 non-null  int64  
 19  reserved_room_type              119390 non-null  object 
 20  assigned_room_type              119390 non-null  object 
 21  booking_changes                 119390 non-null  int64  
 22  deposit_type                    119390 non-null  object 
 23  agent                           103050 non-null  float64
 24  company                         6797 non-null    float64
 25  days_in_waiting_list            119390 non-null  int64  
 26  customer_type                   119390 non-null  object 
 27  adr                             119390 non-null  float64
 28  required_car_parking_spaces     119390 non-null  int64  
 29  total_of_special_requests       119390 non-null  int64  
 30  reservation_status              119390 non-null  object 
 31  reservation_status_date         119390 non-null  object 
dtypes: float64(4), int64(16), object(12)
memory usage: 29.1+ MB
In [74]:
# It appears that a few columns contain some empty values since the Non-Null count for a few columns is lower than the total number of rows (119390). We’ll need to deal with empty values and manually adjust the data type for each column on a case-by-case basis.
In [75]:
#checking for null values 
null_values = pd.DataFrame({'Null Values' : hotel_raw_df.isna().sum(), 'Percentage Null Values' : (hotel_raw_df.isna().sum()) / (hotel_raw_df.shape[0]) * (100)})
null_values
Out[75]:
Null Values Percentage Null Values
hotel 0 0.000000
is_canceled 0 0.000000
lead_time 0 0.000000
arrival_date_year 0 0.000000
arrival_date_month 0 0.000000
arrival_date_week_number 0 0.000000
arrival_date_day_of_month 0 0.000000
stays_in_weekend_nights 0 0.000000
stays_in_week_nights 0 0.000000
adults 0 0.000000
children 4 0.003350
babies 0 0.000000
meal 0 0.000000
country 488 0.408744
market_segment 0 0.000000
distribution_channel 0 0.000000
is_repeated_guest 0 0.000000
previous_cancellations 0 0.000000
previous_bookings_not_canceled 0 0.000000
reserved_room_type 0 0.000000
assigned_room_type 0 0.000000
booking_changes 0 0.000000
deposit_type 0 0.000000
agent 16340 13.686238
company 112593 94.306893
days_in_waiting_list 0 0.000000
customer_type 0 0.000000
adr 0 0.000000
required_car_parking_spaces 0 0.000000
total_of_special_requests 0 0.000000
reservation_status 0 0.000000
reservation_status_date 0 0.000000
In [76]:
# check for missing values

hotel_raw_df.isnull().sum()
Out[76]:
hotel                                  0
is_canceled                            0
lead_time                              0
arrival_date_year                      0
arrival_date_month                     0
arrival_date_week_number               0
arrival_date_day_of_month              0
stays_in_weekend_nights                0
stays_in_week_nights                   0
adults                                 0
children                               4
babies                                 0
meal                                   0
country                              488
market_segment                         0
distribution_channel                   0
is_repeated_guest                      0
previous_cancellations                 0
previous_bookings_not_canceled         0
reserved_room_type                     0
assigned_room_type                     0
booking_changes                        0
deposit_type                           0
agent                              16340
company                           112593
days_in_waiting_list                   0
customer_type                          0
adr                                    0
required_car_parking_spaces            0
total_of_special_requests              0
reservation_status                     0
reservation_status_date                0
dtype: int64
In [77]:
filter = (hotel_raw_df.children == 0) & (hotel_raw_df.adults == 0) & (hotel_raw_df.babies == 0)
hotel_raw_df[~filter]
Out[77]:
hotel is_canceled lead_time arrival_date_year arrival_date_month arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights stays_in_week_nights adults ... deposit_type agent company days_in_waiting_list customer_type adr required_car_parking_spaces total_of_special_requests reservation_status reservation_status_date
0 Resort Hotel 0 342 2015 July 27 1 0 0 2 ... No Deposit NaN NaN 0 Transient 0.00 0 0 Check-Out 2015-07-01
1 Resort Hotel 0 737 2015 July 27 1 0 0 2 ... No Deposit NaN NaN 0 Transient 0.00 0 0 Check-Out 2015-07-01
2 Resort Hotel 0 7 2015 July 27 1 0 1 1 ... No Deposit NaN NaN 0 Transient 75.00 0 0 Check-Out 2015-07-02
3 Resort Hotel 0 13 2015 July 27 1 0 1 1 ... No Deposit 304.0 NaN 0 Transient 75.00 0 0 Check-Out 2015-07-02
4 Resort Hotel 0 14 2015 July 27 1 0 2 2 ... No Deposit 240.0 NaN 0 Transient 98.00 0 1 Check-Out 2015-07-03
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
119385 City Hotel 0 23 2017 August 35 30 2 5 2 ... No Deposit 394.0 NaN 0 Transient 96.14 0 0 Check-Out 2017-09-06
119386 City Hotel 0 102 2017 August 35 31 2 5 3 ... No Deposit 9.0 NaN 0 Transient 225.43 0 2 Check-Out 2017-09-07
119387 City Hotel 0 34 2017 August 35 31 2 5 2 ... No Deposit 9.0 NaN 0 Transient 157.71 0 4 Check-Out 2017-09-07
119388 City Hotel 0 109 2017 August 35 31 2 5 2 ... No Deposit 89.0 NaN 0 Transient 104.40 0 0 Check-Out 2017-09-07
119389 City Hotel 0 205 2017 August 35 29 2 7 2 ... No Deposit 9.0 NaN 0 Transient 151.20 0 2 Check-Out 2017-09-07

119210 rows × 32 columns

In [78]:
hotel_raw_df.sample(10)
Out[78]:
hotel is_canceled lead_time arrival_date_year arrival_date_month arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights stays_in_week_nights adults ... deposit_type agent company days_in_waiting_list customer_type adr required_car_parking_spaces total_of_special_requests reservation_status reservation_status_date
95587 City Hotel 0 82 2016 August 34 19 0 2 3 ... No Deposit 9.0 NaN 0 Transient 150.30 0 3 Check-Out 2016-08-21
34077 Resort Hotel 0 115 2017 March 10 9 0 3 2 ... No Deposit NaN NaN 0 Transient 65.00 0 1 Check-Out 2017-03-12
3913 Resort Hotel 1 72 2016 February 6 6 2 1 2 ... No Deposit NaN NaN 0 Transient-Party 73.33 0 0 Canceled 2016-01-19
23458 Resort Hotel 0 15 2016 April 17 23 2 1 2 ... No Deposit 240.0 NaN 0 Transient 93.67 0 1 Check-Out 2016-04-26
24945 Resort Hotel 0 8 2016 June 24 8 0 2 2 ... No Deposit NaN 154.0 0 Transient 66.00 0 0 Check-Out 2016-06-10
4750 Resort Hotel 0 169 2016 March 14 31 0 3 2 ... Refundable NaN 223.0 0 Transient-Party 66.00 0 0 Check-Out 2016-04-03
24483 Resort Hotel 0 165 2016 May 22 22 1 0 2 ... No Deposit 251.0 NaN 0 Transient 59.00 0 1 Check-Out 2016-05-23
43259 City Hotel 0 56 2015 September 38 17 0 2 2 ... No Deposit 20.0 NaN 0 Transient-Party 82.00 0 0 Check-Out 2015-09-19
33976 Resort Hotel 0 191 2017 February 8 23 4 10 2 ... No Deposit 96.0 NaN 0 Transient 47.00 0 1 Check-Out 2017-03-09
3214 Resort Hotel 0 78 2015 November 47 21 0 1 2 ... No Deposit NaN 342.0 0 Transient-Party 32.00 0 0 Check-Out 2015-11-22

10 rows × 32 columns

In [79]:
# Exploratory Analysis and Visualization
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

sns.set_style('darkgrid')
matplotlib.rcParams['font.size'] = 14
matplotlib.rcParams['figure.figsize'] = (9, 5)
matplotlib.rcParams['figure.facecolor'] = '#00000000'
In [80]:
hotel_raw_df.country
Out[80]:
0         PRT
1         PRT
2         GBR
3         GBR
4         GBR
         ... 
119385    BEL
119386    FRA
119387    DEU
119388    GBR
119389    DEU
Name: country, Length: 119390, dtype: object
In [81]:
# the number of countries from which the total bookings are made
hotel_raw_df.country.nunique()
Out[81]:
177
In [82]:
top_countries_with_codes = hotel_raw_df.country.value_counts().head(10)
top_countries_with_codes
Out[82]:
PRT    48590
GBR    12129
FRA    10415
ESP     8568
DEU     7287
ITA     3766
IRL     3375
BEL     2342
BRA     2224
NLD     2104
Name: country, dtype: int64
In [83]:
plt.figure(figsize=(12,6))
plt.xticks(rotation=75)
plt.title('Top 10 countries from where these hotels are recieving guests')
sns.barplot(x=top_countries_with_codes.index, y=top_countries_with_codes);
In [84]:
url= 'https://raw.githubusercontent.com/lukes/ISO-3166-Countries-with-Regional-Codes/master/all/all.csv'
country_code = pd.read_csv(url)
country_code
Out[84]:
name alpha-2 alpha-3 country-code iso_3166-2 region sub-region intermediate-region region-code sub-region-code intermediate-region-code
0 Afghanistan AF AFG 4 ISO 3166-2:AF Asia Southern Asia NaN 142.0 34.0 NaN
1 Åland Islands AX ALA 248 ISO 3166-2:AX Europe Northern Europe NaN 150.0 154.0 NaN
2 Albania AL ALB 8 ISO 3166-2:AL Europe Southern Europe NaN 150.0 39.0 NaN
3 Algeria DZ DZA 12 ISO 3166-2:DZ Africa Northern Africa NaN 2.0 15.0 NaN
4 American Samoa AS ASM 16 ISO 3166-2:AS Oceania Polynesia NaN 9.0 61.0 NaN
... ... ... ... ... ... ... ... ... ... ... ...
244 Wallis and Futuna WF WLF 876 ISO 3166-2:WF Oceania Polynesia NaN 9.0 61.0 NaN
245 Western Sahara EH ESH 732 ISO 3166-2:EH Africa Northern Africa NaN 2.0 15.0 NaN
246 Yemen YE YEM 887 ISO 3166-2:YE Asia Western Asia NaN 142.0 145.0 NaN
247 Zambia ZM ZMB 894 ISO 3166-2:ZM Africa Sub-Saharan Africa Eastern Africa 2.0 202.0 14.0
248 Zimbabwe ZW ZWE 716 ISO 3166-2:ZW Africa Sub-Saharan Africa Eastern Africa 2.0 202.0 14.0

249 rows × 11 columns

In [85]:
country_code['country'] = country_code['alpha-3'].str.replace('"','')
country_code
Out[85]:
name alpha-2 alpha-3 country-code iso_3166-2 region sub-region intermediate-region region-code sub-region-code intermediate-region-code country
0 Afghanistan AF AFG 4 ISO 3166-2:AF Asia Southern Asia NaN 142.0 34.0 NaN AFG
1 Åland Islands AX ALA 248 ISO 3166-2:AX Europe Northern Europe NaN 150.0 154.0 NaN ALA
2 Albania AL ALB 8 ISO 3166-2:AL Europe Southern Europe NaN 150.0 39.0 NaN ALB
3 Algeria DZ DZA 12 ISO 3166-2:DZ Africa Northern Africa NaN 2.0 15.0 NaN DZA
4 American Samoa AS ASM 16 ISO 3166-2:AS Oceania Polynesia NaN 9.0 61.0 NaN ASM
... ... ... ... ... ... ... ... ... ... ... ... ...
244 Wallis and Futuna WF WLF 876 ISO 3166-2:WF Oceania Polynesia NaN 9.0 61.0 NaN WLF
245 Western Sahara EH ESH 732 ISO 3166-2:EH Africa Northern Africa NaN 2.0 15.0 NaN ESH
246 Yemen YE YEM 887 ISO 3166-2:YE Asia Western Asia NaN 142.0 145.0 NaN YEM
247 Zambia ZM ZMB 894 ISO 3166-2:ZM Africa Sub-Saharan Africa Eastern Africa 2.0 202.0 14.0 ZMB
248 Zimbabwe ZW ZWE 716 ISO 3166-2:ZW Africa Sub-Saharan Africa Eastern Africa 2.0 202.0 14.0 ZWE

249 rows × 12 columns

In [86]:
country_code['country'] = country_code['country'].str.replace(' ','')
country_code['country']
Out[86]:
0      AFG
1      ALA
2      ALB
3      DZA
4      ASM
      ... 
244    WLF
245    ESH
246    YEM
247    ZMB
248    ZWE
Name: country, Length: 249, dtype: object
In [87]:
hotel_df = pd.merge(hotel_raw_df, country_code)
hotel_df
Out[87]:
hotel is_canceled lead_time arrival_date_year arrival_date_month arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights stays_in_week_nights adults ... alpha-2 alpha-3 country-code iso_3166-2 region sub-region intermediate-region region-code sub-region-code intermediate-region-code
0 Resort Hotel 0 342 2015 July 27 1 0 0 2 ... PT PRT 620 ISO 3166-2:PT Europe Southern Europe NaN 150.0 39.0 NaN
1 Resort Hotel 0 737 2015 July 27 1 0 0 2 ... PT PRT 620 ISO 3166-2:PT Europe Southern Europe NaN 150.0 39.0 NaN
2 Resort Hotel 0 0 2015 July 27 1 0 2 2 ... PT PRT 620 ISO 3166-2:PT Europe Southern Europe NaN 150.0 39.0 NaN
3 Resort Hotel 0 9 2015 July 27 1 0 2 2 ... PT PRT 620 ISO 3166-2:PT Europe Southern Europe NaN 150.0 39.0 NaN
4 Resort Hotel 1 85 2015 July 27 1 0 3 2 ... PT PRT 620 ISO 3166-2:PT Europe Southern Europe NaN 150.0 39.0 NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
117615 City Hotel 0 6 2017 June 24 11 2 1 2 ... SD SDN 729 ISO 3166-2:SD Africa Northern Africa NaN 2.0 15.0 NaN
117616 City Hotel 0 116 2017 July 27 2 2 0 2 ... TF ATF 260 ISO 3166-2:TF Africa Sub-Saharan Africa Eastern Africa 2.0 202.0 14.0
117617 City Hotel 0 84 2017 July 30 27 2 5 2 ... SL SLE 694 ISO 3166-2:SL Africa Sub-Saharan Africa Western Africa 2.0 202.0 11.0
117618 City Hotel 0 2 2017 August 32 9 0 3 2 ... LA LAO 418 ISO 3166-2:LA Asia South-eastern Asia NaN 142.0 35.0 NaN
117619 City Hotel 0 3 2017 August 32 10 0 2 2 ... LA LAO 418 ISO 3166-2:LA Asia South-eastern Asia NaN 142.0 35.0 NaN

117620 rows × 43 columns

In [88]:
top_countries = hotel_df.name.value_counts().head(10)
top_countries
Out[88]:
Portugal                                                48590
United Kingdom of Great Britain and Northern Ireland    12129
France                                                  10415
Spain                                                    8568
Germany                                                  7287
Italy                                                    3766
Ireland                                                  3375
Belgium                                                  2342
Brazil                                                   2224
Netherlands                                              2104
Name: name, dtype: int64
In [89]:
plt.figure(figsize=(12,6))
plt.xticks(rotation=75)
plt.title('Top 10 countries from where these hotels are receiving guests')
sns.barplot(x=top_countries.index, y=top_countries);

It appears that a disproportionately high number of bookings are from Portugal, probably because the hotel is located in Portugal itself. The second country is the United Kingdom which is approx. 75% behind.

In [90]:
#Country wise guest (Let’s look at the country-wise guests that both the hotels in total have received.)
country_wise_guests = hotel_df[hotel_df['is_canceled'] == 0]['name'].value_counts().reset_index()
country_wise_guests.columns = ['name', 'No of guests']
country_wise_guests
Out[90]:
name No of guests
0 Portugal 21071
1 United Kingdom of Great Britain and Northern I... 9676
2 France 8481
3 Spain 6391
4 Germany 6069
... ... ...
158 Bahamas 1
159 Tajikistan 1
160 Macao 1
161 Djibouti 1
162 Bahrain 1

163 rows × 2 columns

In [91]:
top_countries_guests = country_wise_guests.head(10)
top_countries_guests
Out[91]:
name No of guests
0 Portugal 21071
1 United Kingdom of Great Britain and Northern I... 9676
2 France 8481
3 Spain 6391
4 Germany 6069
5 Ireland 2543
6 Italy 2433
7 Belgium 1868
8 Netherlands 1717
9 United States of America 1596
In [92]:
plt.figure(figsize=(12,6))
plt.xticks(rotation=75)
plt.title('Top 10 countries from where these hotels are receiving guests')
sns.barplot(x=top_countries_guests.name, y=top_countries_guests['No of guests']);
In [93]:
resort_hotel = hotel_df.loc[(hotel_df["hotel"] == "Resort Hotel") & (hotel_df["is_canceled"] == 0)]
city_hotel = hotel_df.loc[(hotel_df["hotel"] == "City Hotel") & (hotel_df["is_canceled"] == 0)]
In [94]:
# Counting adults and children as paying guests only, not babies.
resort_hotel["adr_pp"] = resort_hotel["adr"] / (resort_hotel["adults"] + resort_hotel["children"])
/var/folders/qm/5x1g9wd51qv_hrcv4kk_g3f80000gn/T/ipykernel_1836/1572811521.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  resort_hotel["adr_pp"] = resort_hotel["adr"] / (resort_hotel["adults"] + resort_hotel["children"])
In [95]:
import numpy as np
resort_hotel_filtered = resort_hotel.replace([np.inf, -np.inf], 0)
In [96]:
city_hotel["adr_pp"] = city_hotel["adr"] / (city_hotel["adults"] + city_hotel["children"])
city_hotel_filtered = city_hotel.replace([np.inf, -np.inf], 0)
/var/folders/qm/5x1g9wd51qv_hrcv4kk_g3f80000gn/T/ipykernel_1836/2552075327.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  city_hotel["adr_pp"] = city_hotel["adr"] / (city_hotel["adults"] + city_hotel["children"])
In [97]:
print("""From all non-cnceled bookings, across all room types and meals, the average prices are:
Resort hotel: {:.2f} € per night and person.
City hotel: {:.2f} € per night and person."""
      .format(resort_hotel_filtered["adr_pp"].mean(), city_hotel_filtered["adr_pp"].mean()))
From all non-cnceled bookings, across all room types and meals, the average prices are:
Resort hotel: 47.50 € per night and person.
City hotel: 59.26 € per night and person.
In [98]:
# normalize price per night (adr):

hotel_df["adr_pp"] = hotel_df["adr"] / (hotel_df["adults"] + hotel_df["children"])
hotel_df_guests = hotel_df.loc[hotel_df["is_canceled"] == 0]
hotel_df_guests
Out[98]:
hotel is_canceled lead_time arrival_date_year arrival_date_month arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights stays_in_week_nights adults ... alpha-3 country-code iso_3166-2 region sub-region intermediate-region region-code sub-region-code intermediate-region-code adr_pp
0 Resort Hotel 0 342 2015 July 27 1 0 0 2 ... PRT 620 ISO 3166-2:PT Europe Southern Europe NaN 150.0 39.0 NaN 0.000
1 Resort Hotel 0 737 2015 July 27 1 0 0 2 ... PRT 620 ISO 3166-2:PT Europe Southern Europe NaN 150.0 39.0 NaN 0.000
2 Resort Hotel 0 0 2015 July 27 1 0 2 2 ... PRT 620 ISO 3166-2:PT Europe Southern Europe NaN 150.0 39.0 NaN 53.500
3 Resort Hotel 0 9 2015 July 27 1 0 2 2 ... PRT 620 ISO 3166-2:PT Europe Southern Europe NaN 150.0 39.0 NaN 51.500
7 Resort Hotel 0 35 2015 July 27 1 0 4 2 ... PRT 620 ISO 3166-2:PT Europe Southern Europe NaN 150.0 39.0 NaN 72.500
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
117615 City Hotel 0 6 2017 June 24 11 2 1 2 ... SDN 729 ISO 3166-2:SD Africa Northern Africa NaN 2.0 15.0 NaN 49.500
117616 City Hotel 0 116 2017 July 27 2 2 0 2 ... ATF 260 ISO 3166-2:TF Africa Sub-Saharan Africa Eastern Africa 2.0 202.0 14.0 63.000
117617 City Hotel 0 84 2017 July 30 27 2 5 2 ... SLE 694 ISO 3166-2:SL Africa Sub-Saharan Africa Western Africa 2.0 202.0 11.0 60.715
117618 City Hotel 0 2 2017 August 32 9 0 3 2 ... LAO 418 ISO 3166-2:LA Asia South-eastern Asia NaN 142.0 35.0 NaN 91.665
117619 City Hotel 0 3 2017 August 32 10 0 2 2 ... LAO 418 ISO 3166-2:LA Asia South-eastern Asia NaN 142.0 35.0 NaN 90.000

73718 rows × 44 columns

In [99]:
# only actual gusts

room_prices = hotel_df_guests[["hotel", "reserved_room_type", "adr_pp"]].sort_values("reserved_room_type")
room_prices
Out[99]:
hotel reserved_room_type adr_pp
69507 City Hotel A 152.000000
65981 Resort Hotel A 14.500000
65982 Resort Hotel A 14.500000
96679 City Hotel A 49.500000
65984 Resort Hotel A 27.000000
... ... ... ...
13296 Resort Hotel H 64.666667
183 Resort Hotel L 37.500000
8934 Resort Hotel L 86.000000
8163 Resort Hotel L 82.000000
582 Resort Hotel L 100.000000

73718 rows × 3 columns

In [100]:
# boxplot:
plt.figure(figsize=(14, 10))
sns.boxplot(x="reserved_room_type",
            y="adr_pp",
            hue="hotel",
            data=room_prices, 
            hue_order=["City Hotel", "Resort Hotel"],
            fliersize=0)
plt.title("Price of room types per night and person", fontsize=16)
plt.xlabel("Room type", fontsize=16)
plt.ylabel("Price [EUR]", fontsize=16)
plt.legend(loc="upper right")
plt.ylim(0, 160)
plt.show()
In [101]:
# grab data:

room_prices_monthly = hotel_df_guests[["hotel", "arrival_date_month", "adr_pp"]].sort_values("arrival_date_month")
room_prices_monthly
Out[101]:
hotel arrival_date_month adr_pp
90695 City Hotel April 45.475
96661 City Hotel April 49.500
96660 City Hotel April 49.500
96659 City Hotel April 81.000
96658 City Hotel April 67.500
... ... ... ...
95721 City Hotel September 63.000
95722 City Hotel September 77.500
95723 City Hotel September 63.000
19345 City Hotel September 109.250
75196 Resort Hotel September 64.000

73718 rows × 3 columns

In [102]:
# order by month:

ordered_months = ["January", "February", "March", "April", "May", "June", "July", "August",
                  "September", "October", "November", "December"]
room_prices_monthly["arrival_date_month"] = pd.Categorical(room_prices_monthly["arrival_date_month"], categories=ordered_months, ordered=True)
In [103]:
# barplot with standard deviation:

plt.figure(figsize=(12, 8))
sns.lineplot(x = "arrival_date_month", y="adr_pp", hue="hotel", data=room_prices_monthly, 
            hue_order = ["City Hotel", "Resort Hotel"], ci="sd", size="hotel", sizes=(2, 2))
plt.title("Room price per night and person over the year", fontsize=16)
plt.xlabel("Month", fontsize=16)
plt.xticks(rotation=90)
plt.ylabel("Price [EUR]", fontsize=16)
plt.show()
In [104]:
#Q1: How many guests are there who have repeatedly canceled the bookings?
hotel_df['is_repeated_guest'] = hotel_df['is_repeated_guest'].replace(0,'No')
hotel_df['is_repeated_guest'] = hotel_df['is_repeated_guest'].replace(1,'Yes')
In [105]:
hotel_df['is_repeated_guest']
Out[105]:
0         No
1         No
2         No
3         No
4         No
          ..
117615    No
117616    No
117617    No
117618    No
117619    No
Name: is_repeated_guest, Length: 117620, dtype: object
In [106]:
sns.set(style = "whitegrid")
plt.title("Canceled or not?", fontdict = {'fontsize': 30})
canceled = sns.countplot(x = hotel_df.hotel, hue = 'is_repeated_guest', data = hotel_df)
In [107]:
#Q2: Which hotel people like to stay and spend more time? (We might see a skew distribution here, it is best that we check this on weekdays and weekends separately)
plt.figure(figsize = (15,10))
sns.boxplot(x = "market_segment", y = "stays_in_week_nights", data = hotel_df, hue = "hotel", palette = 'Set1');
In [108]:
plt.figure(figsize=(15,10))
sns.boxplot(x = "market_segment", y = "stays_in_weekend_nights", data = hotel_df, hue = "hotel", palette = 'Set1');
In [109]:
#Q3: What are the effects of deposit on cancellations by segments.

plt.figure(figsize = (15,10))
sns.set(style = "whitegrid")
plt.title("Countplot Distrubiton of Segment by Deposit Type", fontdict = {'fontsize':20})
ax = sns.countplot(x = "market_segment", hue = 'deposit_type', data = hotel_df)
In [110]:
plt.figure(figsize = (15,10))
sns.set(style = "darkgrid")
plt.title("Countplot Distributon of Segments by Cancellation", fontdict = {'fontsize':20})
ax = sns.countplot(x = "market_segment", hue = 'is_canceled', data = hotel_df)
In [111]:
#Q4: What is the relationship of lead time with cancellation.
hotel_df['is_canceled'] = hotel_df['is_canceled'].replace(0,'No')
hotel_df['is_canceled'] = hotel_df['is_canceled'].replace(1,'Yes')
In [112]:
(sns.FacetGrid(hotel_df, hue = 'is_canceled',
             height = 6,
             xlim = (0,500))
    .map(sns.kdeplot, 'lead_time', shade = True)
    .add_legend());
In [113]:
#Q5: What is the monthly customers each hotel is receiving?
plt.figure(figsize =(15,10))
sns.set(style="whitegrid")
plt.title("Total Customers - Monthly ", fontdict={'fontsize': 20})
ax = sns.countplot(x = "arrival_date_month", hue = 'hotel', data = hotel_df)
In [114]:
#Q6: Which are the most busy month?
# Create a DateFrame with the relevant data:
resort_guests_monthly = resort_hotel.groupby("arrival_date_month")["hotel"].count()
city_guests_monthly = city_hotel.groupby("arrival_date_month")["hotel"].count()

resort_guest_data = pd.DataFrame({"month": list(resort_guests_monthly.index),
                    "hotel": "Resort hotel", 
                    "guests": list(resort_guests_monthly.values)})

city_guest_data = pd.DataFrame({"month": list(city_guests_monthly.index),
                    "hotel": "City hotel", 
                    "guests": list(city_guests_monthly.values)})
full_guest_data = pd.concat([resort_guest_data,city_guest_data], ignore_index=True)
In [115]:
# order by month:
ordered_months = ["January", "February", "March", "April", "May", "June", 
          "July", "August", "September", "October", "November", "December"]
full_guest_data["month"] = pd.Categorical(full_guest_data["month"], categories=ordered_months, ordered=True)
In [116]:
# Dataset contains July and August date from 3 years, the other month from 2 years. Normalize data:
full_guest_data.loc[(full_guest_data["month"] == "July") | (full_guest_data["month"] == "August"),
                    "guests"] /= 3
full_guest_data.loc[~((full_guest_data["month"] == "July") | (full_guest_data["month"] == "August")),
                    "guests"] /= 2
In [117]:
#show figure:
plt.figure(figsize=(13, 8))
sns.lineplot(x = "month", y="guests", hue="hotel", data=full_guest_data, 
             hue_order = ["City hotel", "Resort hotel"], size="hotel", sizes=(2.5, 2.5))
plt.title("Average number of hotel guests per month", fontsize=16)
plt.xlabel("Month", fontsize=16)
plt.xticks(rotation=90)
plt.ylabel("Number of guests", fontsize=16)
plt.show()
In [118]:
# absolute cancelations:
total_cancelations = hotel_raw_df["is_canceled"].sum()
resort_cancelations = hotel_raw_df.loc[hotel_raw_df["hotel"] == "Resort Hotel"]["is_canceled"].sum()
city_cancelations = hotel_raw_df.loc[hotel_raw_df["hotel"] == "City Hotel"]["is_canceled"].sum()

# as percent:
rel_cancel = total_cancelations / hotel_df.shape[0] * 100
rh_rel_cancel = resort_cancelations / hotel_raw_df.loc[hotel_raw_df["hotel"] == "Resort Hotel"].shape[0] * 100
ch_rel_cancel = city_cancelations / hotel_raw_df.loc[hotel_raw_df["hotel"] == "City Hotel"].shape[0] * 100

print(f"Total bookings canceled: {total_cancelations:,} ({rel_cancel:.0f} %)")
print(f"Resort hotel bookings canceled: {resort_cancelations:,} ({rh_rel_cancel:.0f} %)")
print(f"City hotel bookings canceled: {city_cancelations:,} ({ch_rel_cancel:.0f} %)")
Total bookings canceled: 44,224 (38 %)
Resort hotel bookings canceled: 11,122 (28 %)
City hotel bookings canceled: 33,102 (42 %)